#!/usr/bin/env perl

=head1 LICENSE

Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
Copyright [2016-2019] EMBL-European Bioinformatics Institute

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

=cut


=head1 CONTACT

 Please email comments or questions to the public Ensembl
 developers list at <http://lists.ensembl.org/mailman/listinfo/dev>.

 Questions may also be sent to the Ensembl help desk at
 <http://www.ensembl.org/Help/Contact>.

=cut

=head2 import_clinvar_xml

  - parse ClinVar XML and import data for dbSNP or dbVar variants
  - add the short variant data if ClinVar is releasing ahead of dbSNP
  - only import ClinVar records for structural variants already in ensembl
  - use the positions of the features held in ensembl where possible

  - attribs entered
  - review_status    = Status
  - external_id      = Acc
  - clinvar_clin_sig = Desc
  - risk_allele      = HGVS allele (dbSNP variants only)

=cut

use strict;
use warnings;
use Getopt::Long;
use Data::Dumper;
use Date::Manip::Date;
use XML::LibXML::Reader;
use XML::Hash::XS qw();

use Bio::EnsEMBL::Variation::Utils::Sequence qw( get_hgvs_alleles);
use Bio::EnsEMBL::Variation::Utils::SpecialChar qw(replace_char);
use Bio::EnsEMBL::Variation::VariationFeature;
use Bio::EnsEMBL::Slice;
use Bio::DB::Fasta;
use Bio::EnsEMBL::Registry;

our $DEBUG = 0;

my ($data_file, $registry_file, $assembly, $structvar, $done_file, $clean);

GetOptions ("data_file=s"  => \$data_file,
            "registry=s"   => \$registry_file,
            "assembly=s"   => \$assembly,
            "structvar"    => \$structvar, 
            "done_file=s"  => \$done_file,
            "debug"        => \$DEBUG,
            "clean"        => \$clean 
            );

usage() unless defined $data_file && defined $registry_file && defined $assembly;
die "File not found: $data_file\n" unless -e $data_file;

my $reg = 'Bio::EnsEMBL::Registry';
$reg->load_all($registry_file);
my $dba = $reg->get_DBAdaptor('homo_sapiens','variation');
## only merging records by name, so include fails
$dba->include_failed_variations(1);
my $dbh = $dba->dbc;

my $variation_adaptor     = $dba->get_VariationAdaptor('human', 'variation', );
my $var_feat_adaptor      = $dba->get_VariationFeatureAdaptor('human', 'variation', );
my $allele_adaptor        = $reg->get_adaptor('homo_sapiens', 'variation', 'allele');
my $structvar_adaptor     = $dba->get_StructuralVariationAdaptor('human', 'variation', );

my $pheno_feat_adaptor    = $reg->get_adaptor('homo_sapiens', 'variation', 'phenotypefeature');
my $phenotype_adaptor     = $reg->get_adaptor('homo_sapiens', 'variation', 'phenotype');

my $slice_adaptor         = $reg->get_adaptor('homo_sapiens', 'core', 'slice');

## fetch ClinVar source object
my $omimstr= 'OMIM';
my $source      = get_source('ClinVar');
my $omim_source = get_source($omimstr);

## pre-load submitter ids - shortcutting the API for speed
my $submitters = get_submitter_ids($dba);

## remove old data
clean_old_data() if defined $clean; 

## OMIM set id
my $omim_set_id   = get_set($dbh, 'ph_omim');
my %omimVarSet;

## default ClinVar phenotype description if non available
my $default_pheno  = "ClinVar: phenotype not specified";
my $haplotype_type = "Haplotype";

## handle incomplete runs - takes list of ClinVar RC accessions 
my %done;
if (defined $done_file){
  open my $donelist, $done_file ||die "Unable to open list of variants already entered : $!\n";
  while(<$donelist>){
    my $id = (split)[1];
    $done{$id} = 1;
  }
  close $donelist;
}


## parse file
my $data_file_fh;
if ($data_file =~/.gz/) {
  open ($data_file_fh, '<:gzip', $data_file )
    or die "Cannot read file $data_file: $!\n"; #.xml.gz
} else {
  open ( $data_file_fh, '<', $data_file )
    or die "Cannot read file $data_file: $!\n"; #.xml
}

my $reader = XML::LibXML::Reader->new(IO => $data_file_fh);

my $step=1;
while($reader->read) {
  next unless $reader->nodeType == XML_READER_TYPE_ELEMENT;
  next unless $reader->name eq 'ReleaseSet' || $reader->name eq 'ClinVarSet';

  #get ReleaseSet date from within file
  if ($reader->name eq 'ReleaseSet') {
    my $releaseDate = $reader->getAttribute('Dated'); #2019-04-04
    if (defined $releaseDate) {
      my $versionDate = new Date::Manip::Date;
      $versionDate->parse($releaseDate);

      #update ClinVar and OMIM source version with ClinVar xml version
      $source->version($versionDate->printf("%Y%m")); #201904
      save_source_version($source);
      $omim_source->version($source->version);
      save_source_version($omim_source);
    } else {
      print "WARNING: no source version found in input xml therefore no update to db source version.\n";
    }
  } else {
    my $xml = $reader->readOuterXml;
    my $conv = XML::Hash::XS->new(utf8 => 0, encoding => 'utf-8');
    my $set  = $conv->xml2hash($xml, encoding => 'latin1'); #nice encoding of special char
    process_clinvar_set($set);
  }

}

close ($data_file_fh);


sub process_clinvar_set {
  my $set = shift;

  ## dump current structure on exit
  my $current = Dumper $set;

  my %record;
  eval{
    ## get accesion & version
    $record{Acc} = get_accession($set->{ReferenceClinVarAssertion}->{ClinVarAccession});
    warn "processing $record{Acc}\n" if $DEBUG == 1;

    ## recover from partial loads with file of accessions already loaded
    next if $done{$record{Acc}};

    ## clinical significance
    $record{Desc} = get_clinsig ($set->{ReferenceClinVarAssertion}->{ClinicalSignificance});

    ## confidence of assertation
    $record{Status} = $set->{ReferenceClinVarAssertion}->{ClinicalSignificance}->{ReviewStatus};

    ## date of assertion
    $record{DateLastEvaluated} =  $set->{ReferenceClinVarAssertion}->{ClinicalSignificance}->{DateLastEvaluated};

    ## check for somatic, Autosomal dominant, etc.
    $record{inheritance_type} = get_inheritance($set->{ReferenceClinVarAssertion}->{AttributeSet});

    ## trait info (using Acc for error logging)
    ($record{disease}, $record{ontology_accession}) = get_disease($set->{ReferenceClinVarAssertion}->{TraitSet}->{Trait}, $record{Acc});

    ## extract arrayref of PMIDs for citations
    $record{citations} = get_citations($set->{ReferenceClinVarAssertion}->{ObservedIn} );

    ## Variant name, HGVS, OMIM id and location
    print "WARNING: No MeasureSet found for $record{Acc}.\n" if !defined $set->{ReferenceClinVarAssertion}->{MeasureSet};
    print "WARNING: Found GenotypeSet(type: $set->{ReferenceClinVarAssertion}->{GenotypeSet}->{Type}) found for $record{Acc}.\n" if defined $set->{ReferenceClinVarAssertion}->{GenotypeSet};
    next unless defined $set->{ReferenceClinVarAssertion}->{MeasureSet};
    next unless defined $set->{ReferenceClinVarAssertion}->{MeasureSet}->{Type} &&
                $set->{ReferenceClinVarAssertion}->{MeasureSet}->{Type} eq "Variant" ||
                $set->{ReferenceClinVarAssertion}->{MeasureSet}->{Type} eq $haplotype_type;
    $record{feature_info} = get_feature($set->{ReferenceClinVarAssertion}->{MeasureSet}->{Measure}, $record{Acc},$set->{ReferenceClinVarAssertion}->{MeasureSet}->{Type});

    $record{submitters} = get_submitters($set->{ClinVarAssertion}, $record{feature_info});

    if (defined $structvar ){
      if( defined $record{feature_info}->{dbVar} && $record{feature_info}->{dbVar}->[0] =~/\d+/ ){
        warn "Importing SV :  $record{feature_info}->{dbVar}->[0]\n" if $DEBUG == 1;
        import( \%record);
      }
    }
    else{
      if( exists $record{feature_info}->{dbSNP} && $record{feature_info}->{dbSNP}->[0] =~/\d+/ ){
         warn "Importing Var :  $record{feature_info}->{dbSNP}->[0] ($record{Acc})\n" if $DEBUG == 1;
         import( \%record);
      }
      else{
    	my $message =  "Not importing var: ";
    	$message .= " rs: $record{feature_info}->{dbSNP} "        if defined $record{feature_info}->{dbSNP} ;
    	$message .= " on chr: $record{feature_info}->{Chr} "      if defined $record{feature_info}->{Chr} ;
    	$message .= " with HGVS: $record{feature_info}->{hgvs_g}" if defined $record{feature_info}->{hgvs_g} ;
    	$message .= " due to missing data ($record{Acc})\n";
    	warn $message if $DEBUG == 1;
      }
    }
  };
  if( $@ ne ''){
      #die "ERROR: $@\n";
      die "ERROR: $@\n$current\n\n";
  } 
}

## find old set id ( and remove linked variants)
##  or enter new one
sub get_set{

  my $dbh = shift;
  my $set = shift; ## short attrib name for set

  ## info on sets
  my $data =      {
    'ph_omim'           => {
      'desc' => 'Variants linked to entries in the Online Mendelian Inheritance in Man (OMIM) database',
      'name' => 'OMIM phenotype variants'},
  };

  my $attrib_ext_sth = $dbh->prepare(qq[ select attrib_id from attrib
                                       where value = ? and attrib_type_id = 477]);

  my $set_ext_sth = $dbh->prepare(qq[ select variation_set_id
                                      from variation_set, attrib
                                      where variation_set.short_name_attrib_id = attrib.attrib_id
                                      and attrib.value =? ]);

  my $set_ins_sth = $dbh->prepare(qq[ insert into variation_set
                                     (name, description, short_name_attrib_id)
                                      values ( ?,?,?  ) ]);

  my $set_del_sth = $dbh->prepare(qq[ delete from variation_set_variation where variation_set_id = ?]);

  ### look for old set record
  $set_ext_sth->execute( $set);
  my $set_id = $set_ext_sth->fetchall_arrayref();

  if (defined $set_id->[0]->[0] ){
    ## remove old set content
    $set_del_sth->execute($set_id->[0]->[0]) if defined $clean;
    return $set_id->[0]->[0] ;
  }

  ### enter new set record
  $attrib_ext_sth->execute( $set);
  my $attrib_id = $attrib_ext_sth->fetchall_arrayref();
  die "Exiting: attrib not available for $set\n" unless defined $attrib_id->[0]->[0];

  $set_ins_sth->execute( $data->{$set}->{name}, $data->{$set}->{desc}, $attrib_id->[0]->[0] );
  $set_id = $dbh->db_handle->last_insert_id(undef, undef, qw(variation_set set_id))|| die "no insert id for set $set\n";

  return $set_id;

}

## get ClinVar accession & version
sub get_accession{

  my $ClinVarAccession = shift;

  my $accession =  $ClinVarAccession->{Acc} .".".  $ClinVarAccession->{Version} ;

  return $accession;
}

## clean clinical significance
sub get_clinsig{

  my $ClinicalSignificance = shift;
  my $desc;

  if ($ClinicalSignificance->{Description} =~/conflict/){
    ## Description = 'conflicting data from submitters' - the values are in the explanation
    defined $ClinicalSignificance->{Explanation} ?
    $desc = "\L$ClinicalSignificance->{Explanation}->[0]" :
    warn "warning: conflicting ClinicalSignificance but no Explanation\n";
    $desc |='';
    $desc =~ s/\(\d+\)//g; ## remove bracketed counts
    $desc =~ s/\;/\,/g;    ## switch to comma delimited for set
  }
  else{
    $desc = "\L$ClinicalSignificance->{Description}";
  }
  return $desc;
}

=head2 get_inheritance
- mode of inheritance attribute holds somatic status
=cut
sub get_inheritance{

  my $attribute_set = shift;

  my $moi;

  my $attributes = to_array($attribute_set);
  foreach my $attribute(@{$attributes}){
    next unless defined $attribute->{Attribute} &&
                      $attribute->{Attribute}->{Type} eq 'ModeOfInheritance';

    $moi = $attribute->{Attribute}->{content};
  }
  return $moi;
}

=head2 get_feature

- get variant/ structural variant info + location on required assembly
- inlude OMIM ids and HGVS for short variants 
=cut

sub get_feature{

  my $Measure   = shift;
  my $accession = shift;
  my $type      = shift;

  my %feature;

  my $measures = to_array($Measure);
  foreach my $measure(@{$measures}){

#    next unless(ref($measure) eq 'HASH'); ##
    if(ref($measure) ne 'HASH'){
      warn "Multiple measures for $accession - not loading\n";
      next;
    } 

    ## dbSNP/ dbVAR and OMIM ids
    if(defined $measure->{XRef}){
      my $xref_set = to_array($measure->{XRef});
      foreach my $xref( @{$xref_set} ){
        next unless ref($xref) eq  'HASH';
        if (defined $xref->{Type}) {
          push @{$feature{measureXrefs}{$measure->{ID}}{$xref->{DB}}{$xref->{Type}} }, $xref->{ID} ;
        } else {
          $xref->{DB} eq 'dbVar' ?
          push @{$feature{$xref->{DB}}},$xref->{ID} :
          push @{$feature{measureXrefs}{$measure->{ID}}{$xref->{DB}} }, $xref->{ID} ;
        }
      }
      if (defined $feature{measureXrefs}{$measure->{ID}}{$omimstr} &&
      defined $feature{measureXrefs}{$measure->{ID}}{$omimstr}{'Allelic variant'} &&
      defined $feature{measureXrefs}{$measure->{ID}}{'dbSNP'}){
        my %tmpAllelicID;
        foreach my $allele (@{$feature{measureXrefs}{$measure->{ID}}{$omimstr}{'Allelic variant'}}){
          foreach my $tmpRS (@{$feature{measureXrefs}{$measure->{ID}}{'dbSNP'}{'rs'}}){
            push @{$tmpAllelicID{$allele}}, $tmpRS;
          }
        }
        # for haplotype entries: allelic variant ids are not real synonyms & this will result in them not being saved
        $feature{measureXrefs}{$measure->{ID}}{OmimAllele2dbSNP} = \%tmpAllelicID unless $type eq $haplotype_type;
      }
      push @{$feature{'haplo'}{'rs'}}, @{$feature{measureXrefs}{$measure->{ID}}{'dbSNP'}{'rs'}} if $type eq $haplotype_type && defined $feature{measureXrefs}{$measure->{ID}}{'dbSNP'};
   }

    ## position on required assembly
    next unless defined $measure->{SequenceLocation};
    my $seqLocs = to_array($measure->{SequenceLocation});
    foreach my $loc(@{$seqLocs}){

      next unless ref($loc) eq  'HASH'; 
      next unless $loc->{Assembly} eq $assembly;

      $feature{Chr}    = $loc->{Chr} ; 
      $feature{start}  = $loc->{start} ; 
      $feature{end}    = $loc->{stop} ; 

      $feature{measureXrefs}{$measure->{ID}}{Chr}   = $loc->{Chr}    if $type eq $haplotype_type;
      $feature{measureXrefs}{$measure->{ID}}{start} = $loc->{start}  if $type eq $haplotype_type;
      $feature{measureXrefs}{$measure->{ID}}{end}   = $loc->{stop}   if $type eq $haplotype_type;
    }


    ## HGVS genomic - used for allele extraction for novel variants 
    my $assembly_number  = $assembly; ## uses assembly without GRCh
    $assembly_number     =~ s/\D+//;

    my $attrib_set = to_array($measure->{AttributeSet});
    foreach my $attrib (@{$attrib_set}){

      next unless (defined $attrib->{Attribute}->{integerValue} &&
                           $attrib->{Attribute}->{integerValue} == $assembly_number );

      next unless (defined $attrib->{Attribute}->{Type} &&
                           $attrib->{Attribute}->{Type} =~ /HGVS,\s+genomic,\s+top\s+level/ );
	
      $feature{hgvs_g} =  $attrib->{Attribute}->{Change};
      $feature{measureXrefs}{$measure->{ID}}{hgvs_g} = $attrib->{Attribute}->{Change} if $type eq $haplotype_type;
    }

    ## find reported genes, note: each measure set can have a MeasureRelationship gene
    if(defined $measure->{MeasureRelationship}){
      my @genes;          
      my $meas_rels = to_array($measure->{MeasureRelationship});
      foreach my $meas (@{$meas_rels}){
        push @genes, $meas->{Symbol}->{ElementValue}->{content} if $meas->{Symbol}->{ElementValue}->{content} =~ /\w/;
      }
      $feature{gene} = join(",", @genes);
      $feature{measureXrefs}{$measure->{ID}}{gene} = $feature{gene} if $type eq $haplotype_type;
    }

    #last processing of this specific measure
    # if multiple rsIDs for the same measure set in ReferenceClinVarAssertion print warning
    warn "multiple rsIDs for same measure ($measure->{ID}) in RCV ($accession), rsIDs: ".join(",",@{$feature{measureXrefs}{$measure->{ID}}{dbSNP}{rs}} )."\n" if (defined $feature{measureXrefs}{$measure->{ID}}{dbSNP} && scalar @{$feature{measureXrefs}{$measure->{ID}}{dbSNP}{rs}} >1 );

    #populate higher level hash of rsIDs and specific coordinates, hgvs_g
    if (defined $feature{'haplo'} && defined $feature{measureXrefs}{$measure->{ID}}{'dbSNP'} ){
      foreach my $rsID (@{$feature{measureXrefs}{$measure->{ID}}{'dbSNP'}{'rs'}}){
        my $varID = 'rs'.$rsID;
        #check if the existing record has same values as the latest measure set: {measureXrefs}{$measure->{ID}
        if (defined $feature{$varID} &&
            (defined $feature{$varID}{'Chr'} &&
             $feature{$varID}{Chr} != $feature{measureXrefs}{$measure->{ID}}{Chr} ||
             $feature{$varID}{start} != $feature{measureXrefs}{$measure->{ID}}{start} ||
             $feature{$varID}{end} != $feature{measureXrefs}{$measure->{ID}}{end} ||
             $feature{$varID}{hgvs_g} != $feature{measureXrefs}{$measure->{ID}}{hgvs_g}) ){

              # these phenotypes will end up not being imported unless the rsID already exists
             print "WARNING: removing location/hgvs for rsID with multiple locations/hgvs, as either of them can be correct: rs$rsID\n";
             delete @{$feature{$varID}}{qw/Chr start end hgvs_g gene/}; # $feature{'rs'.$rsID} left in palce as a mark
             delete @feature{qw/Chr start end hgvs_g gene/};
        } else {
          @{$feature{$varID}}{qw/Chr start end hgvs_g gene/} = @{$feature{measureXrefs}{$measure->{ID}}}{qw/Chr start end hgvs_g gene/};
        }
      }
    }
  }

  # if haplotype present then save corresponding rsIDs for phenotype_feature_attrib entry
  if ($type eq $haplotype_type && defined $feature{haplo} && defined $feature{haplo}{rs} ) {
    my @rsIDs = keys {map { $_ => 1 } @{$feature{haplo}{rs}}};
    $feature{haplo}{rs} = \@rsIDs;
    # only if more than 1 rsID present make them phenotype_feature_attribs
    if( scalar @rsIDs >1 && (scalar keys $feature{measureXrefs}  == scalar @rsIDs) ){
      my %index;
      @index{@rsIDs} = (0..$#rsIDs);
      foreach my $rs (@{$feature{haplo}{rs}}){
        foreach my $rs2 (@{$feature{haplo}{rs}}){
          next if $rs eq $rs2;
          push @{$feature{haplo}{'rs'.$rs}}, 'rs'.$rs2;
        }
      }
      #remove global details as multiple rsIDs present, each with own coordinates and hgvs_g
      delete @feature{qw/Chr start end hgvs_g gene/};
    } else {
      delete $feature{haplo};
    }
  }

  return \%feature;
}

=head2 get_disease

- extract prefered disease name & ontology terms
- only take first disease but report if more are present
 
=cut
sub get_disease{

  my ($Trait, $accession) = @_;

  my ($disease, $ontology_accession);

  my $traits = to_array($Trait);
  if(scalar(@{$traits}) > 1){ warn "Multiple traits for $accession\n";}

  foreach my $trait (@{$traits}){

    next if defined $disease; ## How should multi-disease assertions be handled? - log to monitor

    my $names = to_array($trait->{Name});

    foreach my $name ( @{$names} ){

      next unless $name->{ElementValue}->{Type} eq "Preferred";
 
      $disease =  $name->{ElementValue}->{content};

      my $xrefs = to_array($name->{XRef});
      my @ontology_accs;
      foreach my $xref (@{$xrefs}){
        next unless $xref->{DB};
        push(@ontology_accs, $xref->{ID})
          if $xref->{DB} eq "Human Phenotype Ontology";

        push(@ontology_accs, 'Orphanet:' .$xref->{ID})
          if $xref->{DB} eq "Orphanet";
      }
      $ontology_accession = $ontology_accs[0]; #only first ontlogy accession will be returned
     }
  }
  return ($disease, $ontology_accession);
}

=head2 get_citations

- extract any pubmed ids supporting this ascertation
 
=cut
sub get_citations{

  my $structure = shift;

  my @citations;
 
  my $observed_in = to_array($structure );
  foreach my $observed_in (@{$observed_in}){

    my $observed_data = to_array($observed_in->{ObservedData});
    foreach my $obs( @{$observed_data} ){

      if ($obs->{Citation}){
        my $citations = to_array($obs->{Citation});
        foreach my $cit(@{$citations}){
          my $ids = to_array($cit->{ID});
          foreach my $id (@{$ids}){
            push @citations, $id->{content} if $id->{Source} && $id->{Source} eq 'PubMed';
          }
        }
      }
    }
  }
  return \@citations;
}

=head2 get_submitters

- extract any submitters of assertations from the ClinVar release

=cut
sub get_submitters{

  my $structure = shift;
  my $mainXrefs = shift;

  my @submitters;

  my $assertions = to_array($structure);
  foreach my $assert(@{$assertions}){
    push @submitters, $assert->{ClinVarSubmissionID}->{submitter};
    # if submitter OMIM then get OMIM allelic variant ID and corresponding rsID from reference record
    next unless $assert->{ClinVarSubmissionID}->{submitter} eq $omimstr;
    if (defined $assert->{ExternalID} && $assert->{ExternalID}->{DB} eq $omimstr) {
      my $omimAlleleID = $assert->{ExternalID}->{ID};
      warn "OMIM submitter with ExternalID BUT not for variation type, assertion(", $assert->{ClinVarAccession}->{Acc}, ") id type(",$assert->{ExternalID}->{Type} ,")!!\n" if $DEBUG == 1 &&  $assert->{ExternalID}->{Type} ne 'Allelic variant';
      #iterate over the measureSet xrefs saved from the ReferenceClinVarAssertion, in case of multiple sets of xref, the one with the expected OMIM submitter allele is used
      foreach my $tmpMeasure (keys %{$mainXrefs->{'measureXrefs'}}) {
        if (defined $mainXrefs->{'measureXrefs'}{$tmpMeasure} &&
            defined $mainXrefs->{'measureXrefs'}{$tmpMeasure}{'OmimAllele2dbSNP'} &&
            defined $mainXrefs->{'measureXrefs'}{$tmpMeasure}{'OmimAllele2dbSNP'}{$omimAlleleID}) {
          if (scalar @{$mainXrefs->{'measureXrefs'}{$tmpMeasure}{'OmimAllele2dbSNP'}{$omimAlleleID}} > 1) {
            warn "multiple rsIDs for same OMIM allelic variant id in same measure: $tmpMeasure, rs:", join(",", @{$mainXrefs->{'measureXrefs'}{$tmpMeasure}{'OmimAllele2dbSNP'}{$omimAlleleID}}), "!\n";
          }
          $mainXrefs->{$omimstr} ||= $omimAlleleID;
          $mainXrefs->{'MIM'} = (split/\./, $omimAlleleID)[0] unless defined $mainXrefs->{'MIM'};
          # if there is a OMIM submitter record, then use the OMIM alleleID and corresponding rsID for the record (for haplotype records multiple Omim allelic ids can be mentioned in the ReferenceClinVarAssertion measures)
          $mainXrefs->{'dbSNP'} = $mainXrefs->{'measureXrefs'}{$tmpMeasure}{'OmimAllele2dbSNP'}{$omimAlleleID} unless defined $mainXrefs->{'dbSNP'};
        }
      }
      $mainXrefs->{'MIM'} = (split/\./, $omimAlleleID)[0] unless defined $mainXrefs->{'MIM'};
    } elsif ($DEBUG == 1){
      warn "OMIM submitter but no ExternalID, assertion(", $assert->{ClinVarAccession}->{Acc}, ") gene(",$mainXrefs->{gene} ,")!!\n";
    }
  }

  #if no OMIM submitter than make sure rsID from dbSNP is at upper hash level
  if (! defined $mainXrefs->{'dbSNP'}){
    foreach my $tmpMeasure (keys %{$mainXrefs->{'measureXrefs'}}) {
      push @{$mainXrefs->{'dbSNP'}}, @{$mainXrefs->{'measureXrefs'}{$tmpMeasure}{'dbSNP'}{'rs'}} if defined $mainXrefs->{'measureXrefs'}{$tmpMeasure}{'dbSNP'} && defined $mainXrefs->{'measureXrefs'}{$tmpMeasure}{'dbSNP'}{'rs'};
    }
    # make sure the list contains unique rsIDs to deal with haplotypes/records encountered with multiple sets using same rsID
    @{$mainXrefs->{'dbSNP'}} = keys {map { $_ => 1 } @{$mainXrefs->{'dbSNP'}} } if defined $mainXrefs->{'dbSNP'};
  }

  return \@submitters;
}

sub to_array{

  my $structure = shift;

  return undef unless defined $structure;

  my @array;
  if (ref($structure ) eq 'ARRAY'){
    @array = @{$structure};
  }
  else{
    push @array, $structure;
  }
  return \@array;
}



=head2  import

- check all required info present & update db with a single ClinVar
 
=cut

sub import{

  my $record = shift;

  my $feature_object;
  my $alt_allele; ## disease associated allele from HGVS
  my $feat;       ## use stored *variation_features where possible

  if(defined $record->{feature_info}->{dbSNP} && !defined $structvar){
    foreach my $rs (@{ $record->{feature_info}->{dbSNP} }){
      my $rsID = 'rs'.$rs;
      # for haplotypes (multiple rsIDs) each rsID has it's own hgvs_g string
      if (defined $record->{feature_info}->{$rsID}){
        @{$record->{feature_info}}{qw/hgvs_g Chr start end gene/} = @{$record->{feature_info}->{$rsID}}{qw/hgvs_g Chr start end gene/};
      }

      ($feature_object,  $feat, $alt_allele) = get_variant($record, $rs);
      next unless defined $feature_object;

      ## add synonym
      add_synonyms($feature_object, $record->{Acc}, $source) if defined $feature_object;
      add_synonyms($feature_object, $record->{feature_info}{$omimstr}, $omim_source) if defined $feature_object && defined $record->{feature_info}{$omimstr};

      ## add phenotype_feature & attrib (there may not be an alt_allele)
      import_phenotype_feature($record, $feature_object, 'Variation', $feat, $alt_allele);
    }
  }
  elsif(defined  $record->{feature_info}->{dbVar}){
    ($feature_object, $feat) = get_structural_variant($record);

    if ( defined $feature_object){
      ## add phenotype_feature & attrib
      import_phenotype_feature($record, $feature_object, 'StructuralVariation', $feat, $alt_allele);
    }
  }
  else{
      warn "Can't import ". $record->{Acc} ." as no xref\n";
  }
}

sub import_phenotype_feature{

  my $record         = shift;
  my $feature_object = shift;
  my $type           = shift;
  my $feat           = shift;
  my $alt_allele     = shift;

  ## deal with non-specified phenos
  $record->{disease} = $default_pheno unless $record->{disease} =~/\w+/;
  $record->{disease} = $default_pheno if $record->{disease} eq "not provided";
  $record->{disease} = $default_pheno if $record->{disease} eq "not specified";

  # Remove special characters from the phenotype description and submitter name
  $record->{disease} = replace_char( $record->{disease});
  foreach my $sub (@{$record->{submitters}}){
    $sub = replace_char( $sub);
  }

  ## look for existing or enter new phenotype object
  my $pheno = get_phenotype($record->{disease}, $record->{ontology_accession});

  my %attribs;
  $attribs{review_status}    = $record->{Status};
  $attribs{external_id}      = $record->{Acc};
  $attribs{clinvar_clin_sig} = $record->{Desc} if $record->{Desc} ne ''; #avoids empty entry if explanation is missing for conflicting evidence
  $attribs{risk_allele}      = $alt_allele if defined $alt_allele && $alt_allele ne "-" && $alt_allele ne '';
  $attribs{associated_gene}  = $record->{feature_info}->{gene} if defined $record->{feature_info}->{gene};
  $attribs{MIM}              = $record->{feature_info}->{MIM} if defined $record->{feature_info}->{MIM};
  $attribs{pubmed_id}        = join(",", @{$record->{citations}}) if $record->{citations} && exists $record->{citations}->[0];
  if (defined $attribs{pubmed_id} && length($attribs{pubmed_id}) > 255) {
    $attribs{pubmed_id}      = substr($attribs{pubmed_id}, 0, 255);
    $attribs{pubmed_id}      = substr($attribs{pubmed_id}, 0,rindex($attribs{pubmed_id}, ","));
  }
  $attribs{inheritance_type} = $record->{inheritance_type} if defined $record->{inheritance_type};
  $attribs{DateLastEvaluated} = $record->{DateLastEvaluated} if defined $record->{DateLastEvaluated};
  $attribs{variation_names}  = join(",", @{$record->{feature_info}->{haplo}->{$feature_object->name}}) if defined $record->{feature_info}->{haplo} && defined $record->{feature_info}->{haplo}->{$feature_object->name};
  my %submitter_ids;
  foreach my $sub (@{$record->{submitters}}){
    ##enter submitter unless already available
    unless ($submitters->{$sub}){
      $submitters->{$sub} = add_submitter($sub);
      warn "Added submitter $sub id " . $submitters->{$sub}  ." for sub\n";
    }
    $submitter_ids{ $submitters->{$sub} } = 1;
  }
  $attribs{submitter_id} = join(",", keys %submitter_ids) if (keys %submitter_ids) >0;

  update_variation_set($feature_object->dbID()) if $type ne 'StructuralVariation' && exists $submitters->{$omimstr} && exists $submitter_ids{$submitters->{$omimstr}} && ! defined $omimVarSet{$feature_object->dbID()}; #if OMIM is a submitter save the variation in the ph_omim variation_set_variation

  foreach my $feature (@{$feat}){ #feat is the VariationFeature which contains the genome coordonates for this variation
    print "entering phenotype_feature type : $type & object id: ".  $feature_object->dbID() . ", position " .$feature->seq_region_start() . "-".  $feature->seq_region_end() . "\n"
      if $DEBUG == 1;

  my $phenofeat = Bio::EnsEMBL::Variation::PhenotypeFeature->new(
      -slice          => $feature->slice(),
      -start          => $feature->seq_region_start(),
      -strand         => $feature->seq_region_strand(),
      -end            => $feature->seq_region_end(),
      -phenotype      => $pheno,
      -is_significant => 1,
      -type           => $type,
      -object         => $feature_object,
      -source         => $source,
      -attribs        => \%attribs
      );
      $pheno_feat_adaptor->store($phenofeat);

  }
}


=head2 update_variation_set
- update variation set variation ph_omim for the new variation
=cut

sub update_variation_set{
  my $variation_id      = shift;

  my $vsv_ins_sth = $dbh->prepare(qq[ insert ignore into variation_set_variation
                                     (variation_id, variation_set_id)
                                      values (?,?)] );
  $vsv_ins_sth->execute( $variation_id, $omim_set_id );
  $omimVarSet{$variation_id} =1;

}

=head2 get_phenotype

 - retrieve existing or enter new phenotype object

=cut
sub get_phenotype{

  my $desc      = shift;
  my $accession = shift;

  $desc =~s /\\x2c|\\X2C/\,/g; ## decode commas
  $desc =~s /\'//g;            ## remove '

  my $pheno = $phenotype_adaptor->fetch_by_description( $desc )->[0];

  unless ( ref($pheno) eq 'Bio::EnsEMBL::Variation::Phenotype' ){

    $pheno = Bio::EnsEMBL::Variation::Phenotype->new(-description => $desc );
    $phenotype_adaptor->store($pheno);
  }

  if($accession){
    $pheno->add_ontology_accession({ accession      => $accession,
                                     mapping_source => 'Data source',
                                     mapping_type   => 'is'
                                     } );
    $phenotype_adaptor->store_ontology_accessions($pheno);
  }

  return $pheno;
}


=head2 get_submitter_ids

 - retrieve existing ids held for assertation submitters

=cut
sub get_submitter_ids{

  my $dba = shift;

  my %submitters;

  my $submitter_ext_sth = $dba->dbc->prepare(qq[ select submitter_id, description from submitter]);
  $submitter_ext_sth->execute()||die;

  my $dat = $submitter_ext_sth->fetchall_arrayref();
  foreach my $l (@{$dat}){
    $submitters{$l->[1]} = $l->[0];
  }

  return \%submitters;
}

=head2 add_submitter

 - add ids for new assertation submitters

=cut

sub add_submitter{

  my $submitter_name = shift;

  my $submitter_ins_sth = $dba->dbc->prepare(qq[ INSERT INTO submitter (description) values (?) ]);
  $submitter_ins_sth->execute($submitter_name);

  my $submitter_ext_sth = $dba->dbc->prepare(qq[ select submitter_id from submitter where description=?]);
  $submitter_ext_sth->execute($submitter_name)||die;
  my $dat = $submitter_ext_sth->fetchall_arrayref();
  warn "added submitter : $submitter_name & $dat->[0]->[0]\n";
  return $dat->[0]->[0];

}


=head2 get_variant 

  - look up or enter variation
  - returns variation & variation_feature objects & associated allele (from HGVS)

=cut
sub get_variant{

  my $record = shift;
  my $rs_id  = shift;

  my $dbSNP  = "rs" . $rs_id;

  $record->{feature_info}->{hgvs_g} |= "";
  print "Seeking $dbSNP ". $record->{feature_info}->{hgvs_g} . "\n" if $DEBUG ==1;
  ## need alleles to input for standard variation & for risk allele attribute
  ## take from HGVS string; should there be multiple rsIDs for the same feature_info, they all will use the same one hgvs_g string
  if ($record->{feature_info}->{hgvs_g} !~ /.*:.*/) {
    defined $record->{feature_info}->{Chr} && $record->{feature_info}->{hgvs_g}  ne "" ?
      $record->{feature_info}->{hgvs_g} = $record->{feature_info}->{Chr} . ":" . $record->{feature_info}->{hgvs_g} :
      ($record->{feature_info}->{hgvs_g} = "unknown" . ":" . $record->{feature_info}->{hgvs_g} );
  }

  my ($ref_allele, $alt_allele);
  eval{
    ($ref_allele, $alt_allele) = get_hgvs_alleles( $record->{feature_info}->{hgvs_g} ) unless $record->{feature_info}->{hgvs_g} eq "unknown:" ;
  };
  ## not printing bulky error message
  warn "Problem finding allele for $dbSNP\n" unless $@ eq '';

  unless (defined $ref_allele && defined  $alt_allele && $ref_allele ne $alt_allele){
    print "Ref + Alt alleles not available for $dbSNP (" . $record->{feature_info}->{hgvs_g} . ") \n";
  }


  ## look for existing variation object to return
  my $var_ob = $variation_adaptor->fetch_by_name($dbSNP);

  if (defined $var_ob){
    my @features = $var_ob->get_all_VariationFeatures();
    return ($var_ob, @features , $alt_allele);
  }
  print "No record found for $dbSNP\n" if $DEBUG ==1;



  ## ClinVar can be ahead of dbSNP - is there enough data to create a variation record?

  if( !defined $record->{feature_info}->{hgvs_g} || !defined $record->{feature_info}->{Chr} ||
      !defined $ref_allele || !defined $alt_allele ||
      (defined $ref_allele && $ref_allele eq '') ||
      (defined $alt_allele && $alt_allele eq '') ||
      (defined $ref_allele && defined $alt_allele && $ref_allele eq $alt_allele) ) {
    warn "Not entering new refSNP: $rs_id as no parsable HGVS available for alleles ($record->{feature_info}->{hgvs_g})\n";
    return undef;
  }

  ## enter new records
  my ($new_var_ob, $var_feat) = enter_var($record,  $ref_allele, $alt_allele, $dbSNP, $record->{inheritance_type});
  return ($new_var_ob,  $var_feat, $alt_allele);
}

=head2 enter_var

  - ClinVar releases more frequently than dbSNP so may have new data
  - enter variation, alleles and variation_feature
  - returns variation & variation_feature objects

=cut
sub enter_var{

  my $data       = shift;
  my $ref_allele = shift;
  my $alt_allele = shift;
  my $dbSNP      = shift;
  my $inherit    = shift;

   unless (defined $ref_allele && defined $alt_allele){
     warn "ERROR: missing alleles for $data->{feature_info}->{hgvs_g} / $dbSNP\n";
     return undef;
   }

  my $somatic = 0;
  $somatic = 1 if defined $inherit && $inherit =~ /Somatic/i;

  my $allele_str = $ref_allele ."/". $alt_allele;
  my $vf_start = $data->{feature_info}->{start};
  my $vf_end = $data->{feature_info}->{end};

  ## get slice for new variationfeature
  ## strand not reported - assumes forward
  my $slice = $slice_adaptor->fetch_by_region( 'chromosome', $data->{feature_info}->{Chr} );

  if ($ref_allele eq '-' && $allele_str ne '-/') {
    if ($vf_start == $vf_end){
      $vf_start = $vf_end+1 ;
    } elsif ( $vf_start +1 == $vf_end) {
      $vf_start = $data->{feature_info}->{end} ;
      $vf_end = $data->{feature_info}->{start} ;
    } elsif ($data->{feature_info}->{hgvs_g} =~ /\d+dup/){ #old style formated hgvs_g ..3-6dupGAGA
      $vf_start = $vf_end + 1;
    }
  } elsif ($allele_str eq '-/' && $data->{feature_info}->{hgvs_g} =~ /\d+dup/){ #new style formated hgvs_g ..3-6dup
    my $ref_slice = $slice_adaptor->fetch_by_region( 'chromosome', $data->{feature_info}->{Chr},
              $data->{feature_info}->{start}, $data->{feature_info}->{end});
    my $ref_seq = $ref_slice->seq();
    $allele_str = "-/".$ref_seq;
    $vf_start= $vf_end + 1;
  } elsif ($alt_allele =~/^$ref_allele/ && $data->{feature_info}->{hgvs_g} =~ m/\[/i ) {
    # check if HGVS was a repeat and reference already contains the a repeat and the number of inserted repeats has to be adjusted
    # eg. rs1555092425 (11:g.108282799A[5]) -> get_hgvs_alleles produces -> A(1) -> A(5) while correct is: ref AAA(3) -> alt AAAAA(5)
    my $refSlice = $slice_adaptor->fetch_by_region( 'chromosome', $data->{feature_info}->{Chr},
                    $vf_start, $vf_start + length($alt_allele) - 1);

    my @refSeq = split //, $refSlice->seq;
    my @altSeq = split //, $alt_allele;
    my ($i,$match) = (0,1);
    next unless $refSeq[0] eq $altSeq[0] ; # skip duplications that don't have start coinciding with hgvs parsed elements
    while ($i< scalar @altSeq && $match){
      $match=0 if ($refSeq[$i] ne $altSeq[$i]);
      $i++;
    }
    $ref_allele = substr($refSlice->seq, 0, $i-1);
    $allele_str = $ref_allele ."/". $alt_allele;
  }

  # case eg. NC_000009.12:g.137233961_137234061del
  elsif ($allele_str eq '/-' && $data->{feature_info}->{hgvs_g} =~ /\d+_\d+del$/){
      my ($start, $end) = $data->{feature_info}->{hgvs_g} =~ m/(\d+)_(\d+)del$/i;
      print "WARNING: HGVS contains different start/end for deletion ($data->{feature_info}->{hgvs_g})\n" if ($start ne $vf_start || $end != $vf_end );
      my $refSlice = $slice_adaptor->fetch_by_region( 'chromosome', $data->{feature_info}->{Chr},
                      $vf_start, $vf_end);
      $ref_allele = $refSlice->seq;
      $allele_str = $ref_allele ."/". $alt_allele;
  }

  # case eg. NC_000007.14:g.5978687_5978689delinsC
  elsif ($data->{feature_info}->{hgvs_g} =~ /\d+_\d+delins[A-Z]+/i){
      my ($start, $end, $alt) = $data->{feature_info}->{hgvs_g} =~ m/(\d+)_(\d+)delins([A-Z]+)$/i;
      print "WARNING: HGVS contains different start/end/alt for delins ($data->{feature_info}->{hgvs_g})\n" if ($start ne $vf_start || $end != $vf_end  || $alt ne $alt_allele);
      my $refSlice = $slice_adaptor->fetch_by_region( 'chromosome', $data->{feature_info}->{Chr},
                      $vf_start, $vf_end);
      $ref_allele = $refSlice->seq;
      $allele_str = $ref_allele ."/". $alt_allele;
  }

  my $var = Bio::EnsEMBL::Variation::Variation->new
    ( -name              => $dbSNP,
      -source            => $source,
      -is_somatic        => $somatic,
      -adaptor           => $variation_adaptor,
    );
  $variation_adaptor->store($var);

  my $vf = Bio::EnsEMBL::Variation::VariationFeature->new
    (-start           => $vf_start,
     -end             => $vf_end,
     -strand          => 1,
     -slice           => $slice,
     -variation_name  => $dbSNP,
     -map_weight      => 1,
     -allele_string   => $allele_str,
     -variation       => $var,
     -source          => $source,
     -is_somatic      => 0,
     -adaptor         => $var_feat_adaptor,
    );

  $var_feat_adaptor->store($vf);

  # save the update ref, alt allele
  foreach my $allele ( $ref_allele, $alt_allele){
    $allele =~s/\s+//;
    my $al = Bio::EnsEMBL::Variation::Allele->new
      ( -variation_id   => $var->dbID(),
        -allele         => $allele,
        -adaptor        => $allele_adaptor,
      );
    $allele_adaptor->store($al);
  }

  my @vf; #return vf to attach phenotype feature to
  push @vf, $vf;
  return ($var, \@vf);
}



=head2 get_structural_variant

  - get structural variant record to use in phenotype feature
  - new data is not entered
  - returns variation & variation_feature objects

=cut
sub get_structural_variant{

  my $record = shift;

  ## sort to get struct var not genotype
  my @ids = sort @{$record->{feature_info}->{dbVar}};
  my $dbvar = pop @ids;

  ## look for existing structural variation object
  my $struct_var_ob = $structvar_adaptor->fetch_by_name($dbvar);

  unless (defined $struct_var_ob && $struct_var_ob ne ''){

    print "Not entering SV: $record->{feature_info}->{dbVar}->[0] as not in db \n";
    return undef;
  }
  my @features = $struct_var_ob->get_all_StructuralVariationFeatures();

  return ($struct_var_ob, @features );
}

=head2 get_source

  - retrieve source object

=cut

sub get_source{
  my $source_name = shift;

  my $source_adaptor  = $reg->get_adaptor('homo_sapiens', 'variation', 'source');

  if (defined $source_name) {
    my $source = $source_adaptor->fetch_by_name( $source_name );
    die ("Source information not held for ",$source_name, "\n") unless defined $source ;
    return $source;
  }

  return $source;
}

=head2 save_source_version

  - updates source object in db

=cut

sub save_source_version {
  my $source = shift;

  my $source_adaptor  = $reg->get_adaptor('homo_sapiens', 'variation', 'source');
  $source_adaptor->update_version($source);

  return $source;
}

sub add_synonyms{

  my $var               = shift;
  my $synonym_accession = shift;
  my $synonym_source    = shift;

  if ($synonym_source->name eq 'ClinVar') {
    $synonym_accession =~ s/\.\d+$//; ##remove version for synonym
  }

  ## multiple rs id can be attached to the same ClinVar id - usually identical duplicates
  ## but we cannot support 2 variants with the same synonym/source
  my $syn_ins_sth = $dba->dbc->prepare(qq[ insert ignore into variation_synonym  
                                           (variation_id, source_id, name)  
                                           values (?,?,?)
                                          ]);

  $syn_ins_sth->execute($var->dbID(), $synonym_source->dbID(), $synonym_accession);


}

## delete previous ClinVar data
## a few entries are withdrawn each time

sub clean_old_data{

  print "Deleting old phenotype, synonym and clinical_significance data\n";

  my $phenfeatat_del_sth = $dba->dbc->do(qq[ delete from phenotype_feature_attrib where phenotype_feature_id in
                                             ( select phenotype_feature.phenotype_feature_id from phenotype_feature, source
                                               where source.name ='ClinVar' 
                                               and phenotype_feature.source_id = source.source_id)
                                            ]);

  my $phenfeat_del_sth   = $dba->dbc->do(qq[ delete from phenotype_feature where source_id in (select source_id from source where name ='ClinVar') ]);

  my $synonym_del_sth    = $dba->dbc->do(qq[ delete from variation_synonym where source_id in (select source_id from source where name ='ClinVar') ]);

  my $var_updt_sth       = $dba->dbc->do(qq[ update variation set clinical_significance=\\N ]);


}


sub usage{

    die "\n\tUsage: import_clinvar_xml -data_file [ClinVar xml] -registry [registry file] -assembly [GRCh37/GRCh38]

\t\toptions: -structvar  (only import ClinVar statuses for structural variations)
\t\toptions: -clean      ( delete old phenotype_feature, phenotype_feature_attrib, variation_set_variation (OMIM set), clinical_significance and synonym data)
\n\n";

}
